# --- Library imports (notebook-style export) ---
import warnings
warnings.filterwarnings("ignore")
import time
import random
from math import *
import operator
import pandas as pd
import numpy as np
from scipy import stats
# import plotting libraries
import matplotlib
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
# %matplotlib inline  # FIX: IPython magic is invalid in a plain .py file; re-enable in a notebook
import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set(font_scale=1.5)
# load make_blobs to simulate data
from sklearn.datasets import make_blobs
from sklearn.datasets import make_classification
# import the ML algorithm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import explained_variance_score
from sklearn.preprocessing import PolynomialFeatures
from statsmodels.tools.eval_measures import rmse
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn import metrics
import statsmodels.api as sm
import statsmodels
import statsmodels.formula.api as smf
import os
import statistics
# pre-processing:
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import Normalizer
# FIX: sklearn.preprocessing.data is a private module (removed in scikit-learn >= 0.24);
# QuantileTransformer has always been importable from the public package path.
from sklearn.preprocessing import QuantileTransformer
# FIX: sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; SimpleImputer replaces it.
from sklearn.impute import SimpleImputer
# import libraries for model validation
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
# import libraries for metrics and reporting
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
import xgboost
import math
from scipy.stats import pearsonr
from sklearn.model_selection import GridSearchCV
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import mean_squared_error
from pandas.plotting import scatter_matrix
# Load the housing dataset.
# NOTE(review): hard-coded absolute Windows path — consider a relative path or config value.
df_house=pd.read_csv("C://Users//disoj//Desktop//Data Science Class//Simplilearn//SL Machine Learning Classes//Machine Learning Project solutions//house_price.csv")
df_house.shape
df_house.head()
df_house.describe()
# Checking missing values
df_house.isnull().sum()
# Mode of total_bedrooms, computed for inspection only; the fill below recomputes it via pandas.
mode=statistics.mode(df_house['total_bedrooms'])
mode
# Filling missing 'total_bedrooms' values with mode since mean doesn't make sense for total bedrooms:
df_house['total_bedrooms']=df_house['total_bedrooms'].fillna(df_house['total_bedrooms'].mode()[0])
# Again Checking missing values
df_house.isnull().sum()
df_house.dtypes
df_house['ocean_proximity'].unique()
# Mark the only non-numeric column as categorical before one-hot encoding.
df_house["ocean_proximity"] = df_house["ocean_proximity"].astype('category')
df_house.dtypes
# One-hot encode ocean_proximity; drop_first avoids the dummy-variable trap.
df_nhouse = pd.get_dummies(df_house, prefix_sep='-',drop_first=True,columns=["ocean_proximity"])
df_nhouse.head()
df_nhouse.dtypes
# Lets apply StandardScaler (zero mean, unit variance) in this case, unlike in Linear Regression.
scaler = StandardScaler(copy=True, with_mean=True, with_std=True).fit(df_nhouse)
rescaled_dfnhouse = scaler.transform(df_nhouse)
# Column names after one-hot encoding.
# NOTE(review): assumes the CSV target column is named 'median_house_price' — confirm against df_nhouse.columns.
colnames = ['longitude','latitude','housing_median_age','total_rooms','total_bedrooms','population','households','median_income','median_house_price','ocean_prox_INLAND','ocean_prox_ISLAND','ocean_prox_NEAR BAY','ocean_prox_NEAR OCEAN']
df_newh = pd.DataFrame(rescaled_dfnhouse, columns=colnames)
df_newh.head()
## Correlations:
df_newh.corr()
# Generating the correlation heat-map
corrmat = df_newh.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20,20))
sns.heatmap(df_newh[top_corr_features].corr(), annot=True, cmap="RdYlGn");
# Histograms:
df_newh.hist(figsize=(15,15), xlabelsize=10);
# visualize the relationship between the features and the response using scatterplots
# FIX: pairplot's `size` parameter was renamed `height` (seaborn >= 0.9) and later removed.
sns.pairplot(df_newh, diag_kind='kde', plot_kws={'alpha':0.6,'s':80,'edgecolor':'k'}, height=4)
# extract the features and target:
X_features = df_newh.drop(['median_house_price'], axis=1)
y_actual = df_newh['median_house_price']
X_features.head()
# Correlations between each feature and the target:
# Pearson correlation of each feature with the target.
features = ['longitude','latitude','housing_median_age','total_rooms','total_bedrooms','population','households','median_income','ocean_prox_INLAND','ocean_prox_ISLAND','ocean_prox_NEAR BAY','ocean_prox_NEAR OCEAN']
target = df_newh['median_house_price'].name
correlations = {}
for f in features:
    # FIX: the loop body was left unindented in the flattened export (a syntax error in a plain script).
    data_temp = df_newh[[f, target]]
    x1 = data_temp[f].values
    x2 = data_temp[target].values
    key = f + ' vs ' + target
    correlations[key] = pearsonr(x1, x2)[0]
# One-column frame of the correlations, sorted by absolute strength (strongest first).
data_correlations = pd.DataFrame(correlations, index=['Value']).T
data_correlations.loc[data_correlations['Value'].abs().sort_values(ascending=False).index]
X_features.values
y_actual.values
# Sort the rows by the target so trends are visible when plotted against row rank below.
y = df_newh.loc[:, ['median_income', 'ocean_prox_INLAND', target]].sort_values(target, ascending=True).values
x = np.arange(y.shape[0])
# The top 5 features are the most strongly correlated with the target "price"; let's plot the best two regressors jointly.
# Three stacked subplots: the two strongest regressors and the target,
# each plotted against row rank (rows were sorted by target above).
plt.subplot(3,1,1)
plt.plot(x,y[:,0])
plt.title('median income and ocean_proximity_INLAND vs house price')
plt.ylabel('median_income')
plt.subplot(3,1,2)
plt.plot(x,y[:,1])
plt.ylabel('ocean_prox_INLAND')
plt.subplot(3,1,3)
plt.plot(x,y[:,2],'r')
plt.ylabel("house price")
plt.show()
# For each X_features, calculate VIF and save in dataframe
VIF = pd.DataFrame()
VIF["VIF_Factor"] = [variance_inflation_factor(X_features.values, i) for i in range(X_features.shape[1])]
VIF["feature"] = X_features.columns
# Most collinear (highest VIF) features first.
VIF.sort_values(['VIF_Factor'], ascending=False).round(5)
# Let's drop the variables with VIF > 3.5:
# Drop the high-VIF (multicollinear) columns identified above.
df_final =df_newh.drop(['population','total_bedrooms','households','total_rooms','longitude','latitude'],axis=1)
df_final.head()
# 2 Again applying feature_importance technique for the further feature selection:
# NOTE(review): this split still uses the full X_features, not df_final — confirm that is intended.
X_train,X_test,y_train,y_test=train_test_split(X_features,y_actual,test_size=0.20,random_state=101)
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)
df_final.shape
#Checking Outliers:
plt.figure(figsize=(25,15))
boxplot=df_final.boxplot(patch_artist=True)
# Calculation of IQR:
Q1=df_final.quantile(0.25)
Q3=df_final.quantile(0.75)
IQR=Q3-Q1
print(IQR)
# IQR for 'median_income', detecting outlier with IQR, TRUE=outliers, FALSE=valid data
print(df_final<(Q1-1.5*IQR))
(df_final>(Q3+1.5*IQR))
# Removing Outliers: keep only rows where no column falls outside [Q1-1.5*IQR, Q3+1.5*IQR].
df_out = df_final[~((df_final < (Q1 - 1.5 * IQR)) | (df_final > (Q3 + 1.5 * IQR))).any(axis=1)]
df_out.shape
df_out.head()
df_final.shape
# Lets see the visualization one more time:
# visualize the relationship between the features and the response using scatterplots
# FIX: pairplot's `size` parameter was renamed `height` (seaborn >= 0.9) and later removed.
sns.pairplot(df_out, diag_kind='kde', plot_kws={'alpha':0.6,'s':80,'edgecolor':'k'}, height=4)
#Checking Outliers:
plt.figure(figsize=(25,15))
boxplot = df_out.boxplot(patch_artist=True)
# extract further remaining data as features/target:
X_out = df_out.drop(['median_house_price'], axis=1)
y_actual = df_out['median_house_price']
X_out.head()
# Splitting data:
X_train,X_test,y_train,y_test=train_test_split(X_out,y_actual,test_size=0.20,random_state=101)
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)
# New model without Outliers:
lin = LinearRegression()
# fit the model to the training data (learn the coefficients)
lin.fit(X_train, y_train)
# pair the feature names with the coefficients
list(zip(X_train, lin.coef_))
# make predictions on the testing set
y_pred = lin.predict(X_test)
print(y_pred)
# Fitting Polynomial Regression (degree 2) to the dataset.
poly = PolynomialFeatures(degree=2)
# FIX: fit_transform already fits `poly`; the original extra poly.fit(X_poly, y_train) call
# re-fit the transformer on the already-expanded matrix (and y is ignored by PolynomialFeatures).
X_poly = poly.fit_transform(X_train)
lin2 = LinearRegression()
lin2.fit(X_poly, y_train)
list(zip(X_train, lin2.coef_))
X_train.shape
X_test.shape
# Degree-3 polynomial fit:
poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X_train)
lin3 = LinearRegression()
lin3.fit(X_poly, y_train)
list(zip(X_train, lin3.coef_))
# NOTE: y_pred below comes from the plain linear model `lin`, not the polynomial fits.
errors = abs(y_pred - y_test)
# Display the performance metrics
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
# CAUTION: the target was standardized, so y_test can be near zero or negative,
# making this percentage-error "accuracy" figure unreliable.
mape = np.mean(100 * (errors / y_test))
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%.')
# Model evaluation metrics for regression(degree=1)
print('y-intercept : ', lin.intercept_)
print('beta coefficients : ', lin.coef_)
print('Mean Abs Error MAE : ', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Sq Error MSE : ', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Sq Error RMSE : ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('r2 value : ', metrics.r2_score(y_test, y_pred))
X_train = sm.add_constant(X_train) ## let's add an intercept (beta_0) to our model
X_test = sm.add_constant(X_test)
# Ordinary least squares fit for a full statistical summary.
lm3 = sm.OLS(y_train,X_train).fit()
lm3.summary()
# fitted values (need a constant term for intercept)
model_fitted_y = lm3.fittedvalues
# model residuals
model_residuals = lm3.resid
# normalized residuals
model_norm_residuals = lm3.get_influence().resid_studentized_internal
# absolute squared normalized residuals
model_norm_residuals_abs_sqrt = np.sqrt(np.abs(model_norm_residuals))
# absolute residuals
model_abs_resid = np.abs(model_residuals)
# leverage, from statsmodels internals
model_leverage = lm3.get_influence().hat_matrix_diag
# cook's distance, from statsmodels internals
model_cooks = lm3.get_influence().cooks_distance[0]
# Checking the presence of heteroscedasticity:
name = ['Lagrange multiplier statistic', 'p-value',
'f-value', 'f p-value']
bp = statsmodels.stats.diagnostic.het_breuschpagan(lm3.resid, lm3.model.exog)
bp
# NOTE(review): pd.DataFrame(name, bp) uses the test statistics as the index —
# probably meant pd.DataFrame(bp, index=name); verify the intended display.
pd.DataFrame(name, bp)
# Both test statistics are significant, meaning the data violates the assumption of homoscedasticity, i.e. heteroscedasticity is present. In other words, since the p-value is less than 0.05, we reject the null hypothesis of homoscedasticity.
# Q-Q / probability plot of the OLS residuals against a normal distribution.
stats.probplot(lm3.resid, plot= plt)
plt.title("Model1 Residuals Probability Plot");
# The residuals (blue dots) should fall on the red line; this plot indicates that the model's residuals are not normally distributed.
# Kolmogorov-Smirnov test (for normality):
# Kolmogorov-Smirnov test of the residuals against a standard normal distribution.
stats.kstest(lm3.resid, 'norm')
# The test is significant, which indicates that the model's residuals are not normally distributed: REJECT the null hypothesis (that the residuals are normally distributed).
# Perform Linear Regression with one independent variable:
#  - Extract just the median_income column from the independent variables (from X_train and X_test).
#  - Perform Linear Regression to predict housing values based on median_income.
#  - Predict output for the test dataset using the fitted model.
#  - Plot the fitted model for the training data as well as for the test data to check whether the fitted model satisfies the test data.
df_house.head()
df_out.head()
# Keep only median_income as the single predictor (every other surviving column is dropped).
X_medin =df_out.drop(['median_house_price','housing_median_age','ocean_prox_INLAND','ocean_prox_ISLAND','ocean_prox_NEAR BAY','ocean_prox_NEAR OCEAN'],axis=1)
y_actual=df_out['median_house_price']
X_medin.head()
print(X_medin.shape)
print(y_actual.shape)
# Splitting X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_medin, y_actual, random_state=1, test_size=0.2)
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)
# Default figure size/font for the plots below.
plt.rcParams['figure.figsize'] = (8, 6)
plt.rcParams['font.size'] = 14
# Pandas scatter plot
df_out.plot(kind='scatter', x='median_income', y='median_house_price', alpha=0.2)
# Seaborn scatter plot with regression line
sns.lmplot(x='median_income', y='median_house_price', data=df_out, aspect=1.5, scatter_kws={'alpha':0.2})
# Simple one-feature linear regression.
linreg = LinearRegression()
# fit the model to the training data (learn the coefficients)
linreg.fit(X_train, y_train)
# print the coefficients
print(linreg.intercept_)
print(linreg.coef_)
list(zip(X_train, linreg.coef_))
# OR:
# visualize the relationship between the features and the response using scatterplots
plt.scatter(X_train, y_train, color = 'blue')
plt.plot(X_test, linreg.predict(X_test), color = 'red')
plt.title('Linear Regression')
plt.xlabel('median_income')
plt.ylabel('median_house_price')
from sklearn.preprocessing import PolynomialFeatures

# Polynomial regression sweep over degrees 2-4 on the single median_income feature.
# The three near-identical cells are consolidated into one helper. The fitted-curve
# plot is also fixed: the original plotted predictions against unsorted X, which
# renders as a zig-zag scribble; sorting by x draws a smooth curve.
def _fit_poly_and_plot(degree, X, y):
    """Fit LinearRegression on degree-`degree` polynomial features of X,
    print/plot the fit, and return (model, fitted PolynomialFeatures)."""
    poly = PolynomialFeatures(degree=degree)
    X_poly = poly.fit_transform(X)
    model = LinearRegression()
    model.fit(X_poly, y)
    print(model.intercept_)
    print(model.coef_)
    # sort by the (single) feature so the fitted curve renders as a smooth line
    order = np.argsort(np.asarray(X).ravel())
    plt.scatter(X, y, color='red')
    plt.plot(np.asarray(X).ravel()[order], model.predict(X_poly)[order], color='blue')
    plt.show()
    return model, poly

linreg2, _ = _fit_poly_and_plot(2, X_train, y_train)
linreg3, _ = _fit_poly_and_plot(3, X_train, y_train)
# keep `poly` bound to the degree-4 transformer, matching the original module state
linreg4, poly = _fit_poly_and_plot(4, X_train, y_train)
df_out.corr()
#Checking Outliers[Outliers have already been detected in first part]
plt.figure(figsize=(25,15))
boxplot=df_out.boxplot(patch_artist=True)
# Keep only median_income and the target for the formula-API model below.
df_med=df_out.drop(['housing_median_age','ocean_prox_INLAND','ocean_prox_ISLAND','ocean_prox_NEAR BAY','ocean_prox_NEAR OCEAN'],axis=1)
df_med.head()
df_med.hist(figsize=(9,6), xlabelsize = 10);
# Predict on the test set with the simple (degree-1) model.
y_pred = linreg.predict(X_test)
y_pred
# Model evaluation metrics for linear regression:
print('y-intercept : ', linreg.intercept_)
print('beta coefficients : ', linreg.coef_)
print('Mean Abs Error MAE : ', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Sq Error MSE : ', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Sq Error RMSE : ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('r2 value : ', metrics.r2_score(y_test, y_pred))
# Model evaluation metrics for polynomial (4th degree) regression:
# BUG FIX: the original reused the linear model's y_pred here, so the "degree 4"
# metrics were identical to the linear ones. Evaluate linreg4 on its own predictions.
poly4 = PolynomialFeatures(degree=4)
y_pred4 = linreg4.predict(poly4.fit_transform(X_test))
print('y-intercept : ', linreg4.intercept_)
print('beta coefficients : ', linreg4.coef_)
print('Mean Abs Error MAE : ', metrics.mean_absolute_error(y_test, y_pred4))
print('Mean Sq Error MSE : ', metrics.mean_squared_error(y_test, y_pred4))
print('Root Mean Sq Error RMSE : ', np.sqrt(metrics.mean_squared_error(y_test, y_pred4)))
print('r2 value : ', metrics.r2_score(y_test, y_pred4))
# Fit an OLS model via the statsmodels formula API: median_house_price ~ median_income.
model_f = 'median_house_price ~ median_income'
model = smf.ols(formula=model_f, data=df_med)
model_fit = model.fit()
# fitted values (the formula API adds the intercept automatically)
model_fitted_y = model_fit.fittedvalues
# model residuals
model_residuals = model_fit.resid
# studentized (normalized) residuals
model_norm_residuals = model_fit.get_influence().resid_studentized_internal
# square root of absolute normalized residuals (used for scale-location plots)
model_norm_residuals_abs_sqrt = np.sqrt(np.abs(model_norm_residuals))
# absolute residuals
model_abs_resid = np.abs(model_residuals)
# leverage, from statsmodels internals
model_leverage = model_fit.get_influence().hat_matrix_diag
# cook's distance, from statsmodels internals
model_cooks = model_fit.get_influence().cooks_distance[0]
# Residuals-vs-fitted diagnostic plot.
plot_lm_1 = plt.figure(1)
plot_lm_1.set_figheight(6)
plot_lm_1.set_figwidth(9)
# FIX: residplot's data arguments are keyword-only in seaborn >= 0.12.
plot_lm_1.axes[0] = sns.residplot(x=model_fitted_y, y='median_house_price', data=df_med,
                                  lowess=True,
                                  scatter_kws={'alpha': 0.5},
                                  line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
plot_lm_1.axes[0].set_title('Residuals vs Fitted')
plot_lm_1.axes[0].set_xlabel('Fitted values')
plot_lm_1.axes[0].set_ylabel('Residuals')
# annotate the three largest absolute residuals with their row index
# (FIX: the loop body was left unindented in the flattened export)
abs_resid = model_abs_resid.sort_values(ascending=False)
abs_resid_top_3 = abs_resid[:3]
for i in abs_resid_top_3.index:
    plot_lm_1.axes[0].annotate(i, xy=(model_fitted_y[i], model_residuals[i]));
# Plot outputs (test data vs fitted line)
plt.scatter(X_test, y_test, color='black')
plt.plot(X_test, y_pred, color='blue', linewidth=3)
plt.xticks(())
plt.yticks(())
plt.show()
# Training data with the same fitted line, to visually check generalization.
plt.scatter(X_train, y_train, color='black')
plt.plot(X_test, y_pred, color='blue', linewidth=3)
plt.xticks(())
plt.yticks(())
plt.show()
# FIX: `sma` was never defined — the statsmodels.api alias imported in this file is `sm`.
X_train = sm.add_constant(X_train) ## let's add an intercept (beta_0) to our model
X_test = sm.add_constant(X_test)
linreg = sm.OLS(y_train, X_train).fit()
linreg.summary()
# R2 surprisingly decreased after removing outliers; I will check it further with DT and RF classifiers. Hopefully, bagging and bootstrapping in Random Forest will help enhance the model.
df_out.head()
print(X_medin.shape)
print(y_actual.shape)
# Splitting X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_medin, y_actual, random_state=1, test_size=0.2)
# The r-squared value is very low, so let's try the XGBoost algorithm to see if we can get better results.
# FIX: use a distinct name so we don't shadow the `xgb` module alias imported at the top of the file.
xgb_model = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                                 colsample_bytree=1, max_depth=3)
xgb_model.fit(X_train, y_train)
predictions = xgb_model.predict(X_test)
# FIX: explained_variance_score expects (y_true, y_pred) in that order; the original swapped them.
print(explained_variance_score(y_test, predictions))
# Not that significant an improvement; maybe I need to find the best max_depth value and number of estimators:
# instantiate the XGBRegressor with different parameters:
# FIX: objective 'reg:linear' is deprecated (removed in recent XGBoost); 'reg:squarederror' is the same loss.
xg_reg = XGBRegressor(objective='reg:squarederror',
                      colsample_bytree=0.3,
                      learning_rate=0.1,
                      max_depth=5,
                      alpha=10,
                      n_estimators=10)
xg_reg.fit(X_train, y_train)
preds = xg_reg.predict(X_test)
# FIX: explained_variance_score expects (y_true, y_pred) in that order; the original swapped them.
print(explained_variance_score(y_test, preds))
# FIX: use a distinct name so we don't shadow the `rmse` function imported from statsmodels above.
rmse_value = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse_value))
# median_income is the most important factor for predicting house prices, but we need to include more variables in order to improve the model.
# ------END------